'''
This script replicates Table 1 from Zhang and Pan 2019.  That table compares CASM to other China event datasets.  This will compare Venezuela, Spain, Hong Kong, South Korea, and Pakistan to our data.
'''


#################################################
#
#		GLOBALS
#
#################################################
import pandas as pd
import os
import unidecode  # Get rid of diacritics
from fuzzywuzzy import fuzz, process  # String matching. See https://www.datacamp.com/community/tutorials/fuzzy-string-python

#os.chdir('<path/to/Replication/>')

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)


#################################################
#
#		READ IN DATA
#
#################################################
######
# TWITTER
######
twitter = pd.read_csv('./Data/02_processedData/f_eParsedForMainRegressions.csv')  # After subsetting to get cities occurring often.

######
# OTHERS
######
acled = pd.read_csv('./Data/eventDataComparison/eventData_CASMTable1_acled.csv')
icews = pd.read_csv('./Data/eventDataComparison/eventData_CASMTable1_icews.csv')
mmad = pd.read_csv('./Data/eventDataComparison/eventData_CASMTable1_mmad.csv')

######
# CLEAN
######
### Make column names will need for merging the same.  Want Country-State-City-Event_Date
twitter = twitter.rename(columns={'country': 'Country', 'city_use': 'City', 'day': 'Event_date', 'rg.state': 'State'})
acled = acled.rename(columns={'country': 'Country', 'admin1': 'State', 'location': 'City', 'event_date': 'Event_date'})
icews = icews.rename(columns={'Province': 'State', 'Event_Date': 'Event_date'})
mmad = mmad.rename(columns={'country': 'Country', 'asciiname': 'City', 'event_date': 'Event_date'})  # NB: MMAD does not have a state variable


### Will keep track of which datasets have
twitter['Source'] = 'Twitter'
acled['Source'] = 'ACLED'
icews['Source'] = 'ICEWS'
mmad['Source'] = 'MMAD'

### Make country names match
twitter['Country'][twitter['Country'] == 'ES'] = 'Spain'
twitter['Country'][twitter['Country'] == 'HK'] = 'Hong Kong'
twitter['Country'][twitter['Country'] == 'KR'] = 'South Korea'
twitter['Country'][twitter['Country'] == 'PK'] = 'Pakistan'
twitter['Country'][twitter['Country'] == 'VE1'] = 'Venezuela'
twitter['Country'][twitter['Country'] == 'VE2'] = 'Venezuela'

mmad['Country'][mmad['Country'] == 'China'] = 'Hong Kong'


# Need strings, there are some nan values
twitter['City'] = [unidecode.unidecode(str(item)) for item in twitter['City']]
acled['City'] = [unidecode.unidecode(str(item)) for item in acled['City']]
icews['City'] = [unidecode.unidecode(str(item)) for item in icews['City']]
mmad['City'] = [unidecode.unidecode(str(item)) for item in mmad['City']]

# Many ICEWS cities are nan, so let us remove.
icews = icews[icews['City'] != 'nan']

# Make protest variable
acled['protest'] = 1
icews['protest'] = 1
mmad['protest'] = 1


#################################################
#
#		SEE IF NEED TO CHANGE PLACE NAMES TO MATCH
#
#################################################
tw_cities = set(twitter['City'])
a_cities = set(acled['City'])
i_cities = set(icews['City'])
m_cities = set(mmad['City'])


matches_a = {}
for item in a_cities:
	matches_a[item] = process.extractOne(item, tw_cities)  # Gets lahore, which is only match
# Result: only matches of 100 are correct

matches_i = {}
for item in i_cities:
	matches_i[item] = process.extractOne(item, tw_cities)
# Result: only matches of 100 are correct

matches_m = {}
for item in m_cities:
	matches_m[item] = process.extractOne(item, tw_cities)
# Only matches of 100 are correct.  But remember Hong Kong as city for MMAD is not split smaller.


## Expect Twitter will have more.
a_cities.difference(tw_cities)  # Lots more in ACLED for Pakistan
i_cities.difference(tw_cities)  #
m_cities.difference(tw_cities)  # Remember that MMAD does not break out HK


## Update names
acled['City'] = acled['City'].str.replace('Catia', 'Catia la Mar')

icews['City_Original'] = icews['City']  # Create this column so I can compare ICEWS with itself and with changes I make

## Update ICEWS for Venezuela
icews['City'] = icews['City'].str.replace('Bello Monte', 'Caracas')
icews['City'] = icews['City'].str.replace('Altamira', 'Caracas')  # ICEWS codes as the small town in Trujillo, but I believe they mean the neighborhood of Caracas.  Since I don't have that fine of resolution, I will make it Caracas.
icews['City'] = icews['City'].str.replace('El Paraiso', 'Caracas')
icews['City'] = icews['City'].str.replace('Los Palos Grandes', 'Caracas')
icews['City'] = icews['City'].str.replace('Los Ruices', 'Caracas')

icews['State'] = icews['State'].str.replace('Distrito Capital', 'Capital')
icews['State'] = icews['State'].str.replace('Estado ', '')

## Update ICEWS city, state for Hong Kong
icews['City'] = icews['City'].str.replace('Admiralty', 'Central')
icews['City'] = icews['City'].str.replace('Mongkok', 'Kowloon')
icews['City'] = icews['City'].str.replace('Tsim Sha Tsui', 'Kowloon')

# These name updates are to match the name updates I made in Twitter.
icews['City'] = icews['City'].str.replace('Sha Tin', 'Central')
icews['City'] = icews['City'].str.replace('Tsuen Wan', 'Central')
icews['City'] = icews['City'].str.replace('Wan Chai', 'Central')

icews['State'] = icews['State'].replace('Sha Tin', 'Central and Western')
icews['State'] = icews['State'].replace('Central and Western District', 'Central and Western')
icews['State'] = icews['State'].replace('Wan Chai', 'Central and Western')
icews['State'] = icews['State'].replace('Yau Tsim Mong', 'Kowloon City')
icews['State'] = icews['State'].replace('Hong Kong Island', 'Central and Western')

## Update ICEWS city, state for South Korea
icews['State'] = icews['State'].replace('Gyeonggi Province', 'Seoul')  # Gyeonggi-do from Twitter
icews['State'] = icews['State'].replace('Busan', 'Gyeongsangnam-do')  # Busan is the city in Gyeongsangnam
icews['State'] = icews['State'].replace('Seoul-teukbyeolsi', 'Seoul')
icews['State'] = icews['State'].replace('Incheon', 'Seoul')  # Incheon is not a state
icews['City'] = icews['City'].replace('Seongnam', 'Seoul')

## Update ICEWS location for Catalonia
icews['State'] = icews['State'].replace('Catalunya', 'Catalonia')
icews['City'] = icews['City'].replace('Montjuic', 'Barcelona')

mmad['City'] = mmad['City'].str.replace('Merida', 'Mérida')
mmad['City'] = mmad['City'].str.replace('Altamira', 'Caracas')
mmad['City'] = mmad['City'].str.replace('Campo de Carabobo', 'Tocuyito')  # https://es.wikipedia.org/wiki/Campo_de_Carabobo
mmad['City'] = mmad['City'].str.replace('San Antonio de Los Altos', 'San Antonio De Los Altos')
mmad['City'] = mmad['City'].str.replace('San Cristobal', 'San Cristóbal')


#################################################
#
#		AGGREGATE THE EVENT DATA
#
#################################################
## Aggregate each event dataset to country-state-city-day.
acled_agg = acled.groupby(['Country', 'State', 'City', 'Event_date']).agg({'protest': 'sum'}).reset_index()
icews_agg = icews.groupby(['Country', 'State', 'City', 'Event_date']).agg({'protest': 'sum'}).reset_index()
mmad_agg = mmad.groupby(['Country', 'City', 'Event_date']).agg({'protest': 'sum'}).reset_index()  # MMAD does not have state


#################################################
#
#		MERGE
#			Will compare each dataset to Twitter one at a time, should be easier to inspect the name discrepancies.  Merging all at once risks too much errors on my part.
#################################################

######
# ACLED
######
twitter_temp = twitter[twitter['Country'] == 'Pakistan']  # ACLED only has PK
twitter_acled = twitter_temp.merge(acled_agg, how='outer', on=['Country', 'State', 'City', 'Event_date'], indicator=True)
# Below is for all events
both = twitter_acled[twitter_acled['_merge'] == 'both']  # In both
left = twitter_acled[twitter_acled['_merge'] == 'left_only']  # Only Twitter
right = twitter_acled[twitter_acled['_merge'] == 'right_only']  # ACLED only

# Below is for only Lahore
ta_lahore = twitter_acled[twitter_acled['City'] == 'Lahore']
both = ta_lahore[ta_lahore['_merge'] == 'both']  # In both
left = ta_lahore[ta_lahore['_merge'] == 'left_only']  # Only Twitter
right = ta_lahore[ta_lahore['_merge'] == 'right_only']  # ACLED only

# Percent in ACLED that are in Twitter
both.shape[0]/(both.shape[0] + right.shape[0])
# Percent in Twitter not in ACLED
(left.shape[0])/(both.shape[0]+left.shape[0])


######
# MMAD, HONG KONG
######
twitter_temp = twitter[twitter['Country'] == 'Hong Kong']
mmad_agg_temp = mmad_agg[mmad_agg['Country'] == 'Hong Kong']
twitter_temp['City'] = 'Hong Kong'  # MMAD does not have variation w/in Hong Kong

twitter_mmad = twitter_temp.merge(mmad_agg_temp, how='outer', on=['Country', 'City', 'Event_date'], indicator=True)  # MMAD does not have state

# Below is for all events
both = twitter_mmad[twitter_mmad['_merge'] == 'both']  # In both
left = twitter_mmad[twitter_mmad['_merge'] == 'left_only']  # Only Twitter
right = twitter_mmad[twitter_mmad['_merge'] == 'right_only']  # MMAD only

# Percent in MMAD also in Twitter
both.shape[0]/(both.shape[0] + right.shape[0])


######
# MMAD, VENEZUELA
######
twitter_temp = twitter[twitter['Country'] == 'Venezuela']
mmad_agg_temp = mmad_agg[mmad_agg['Country'] == 'Venezuela']


## Get dates matching.  MMAD does not go into 2016, starts to early for the Twitter data
mmad_agg_temp = mmad_agg_temp[mmad_agg_temp['Event_date'] >= '2014-10-22']  # start of Twitter data for VE
mmad_agg_temp = mmad_agg_temp[mmad_agg_temp['Event_date'] <= '2015-02-02']  # end of Twitter date for VE

twitter_mmad = twitter_temp.merge(mmad_agg_temp, how='outer', on=['Country', 'City', 'Event_date'], indicator=True)  # MMAD does not have state

# Below is for all events
both = twitter_mmad[twitter_mmad['_merge'] == 'both']  # In both
left = twitter_mmad[twitter_mmad['_merge'] == 'left_only']  # Only Twitter
right = twitter_mmad[twitter_mmad['_merge'] == 'right_only']  # MMAD only

# Percent in MMAD also in Twitter
both.shape[0]/(both.shape[0] + right.shape[0])  #9%
# Percent in Twitter not in MMAD
left.shape[0]/(both.shape[0] + left.shape[0])



######
# ICEWS, PAKISTAN
######
twitter_temp = twitter[twitter['Country'] == 'Pakistan']  # ACLED only has PK
icews_temp = icews_agg[icews_agg['Country'] == 'Pakistan']
twitter_icews = twitter_temp.merge(icews_temp, how='outer', on=['Country', 'State', 'City', 'Event_date'], indicator=True)


# Below is for all events
both = twitter_icews[twitter_icews['_merge'] == 'both']  # In both
left = twitter_icews[twitter_icews['_merge'] == 'left_only']  # Only Twitter
right = twitter_icews[twitter_icews['_merge'] == 'right_only']  # ACLED only

# Below is for only Lahore
ta_lahore = twitter_icews[twitter_icews['City'] == 'Lahore']
both = ta_lahore[ta_lahore['_merge'] == 'both']  # In both
left = ta_lahore[ta_lahore['_merge'] == 'left_only']  # Only Twitter
right = ta_lahore[ta_lahore['_merge'] == 'right_only']  # ICEWS only

# Percent in ICEWS also in Twitter
both.shape[0]/(both.shape[0]+right.shape[0])

right['City'].value_counts() # How many locations in ICEWS are NA? 0

######
# ICEWS, HONG KONG
######ß
twitter_temp = twitter[twitter['Country'] == 'Hong Kong']
icews_temp = icews_agg[icews_agg['Country'] == 'Hong Kong']
icews_temp = icews_temp[icews_temp['Event_date'] >= '2014-09-18']
icews_temp = icews_temp[icews_temp['Event_date'] <= '2014-12-31']
twitter_icews = twitter_temp.merge(icews_temp, how='outer', on=['Country', 'State', 'City', 'Event_date'], indicator=True)

# Below is for all events
both = twitter_icews[twitter_icews['_merge'] == 'both']  # In both
left = twitter_icews[twitter_icews['_merge'] == 'left_only']  # Only Twitter
right = twitter_icews[twitter_icews['_merge'] == 'right_only']  # ICEWS only, 3 of 23 do not have city location

# Percent in ICEWS also in Twitter
both.shape[0]/(both.shape[0]+right.shape[0])

right['City'].value_counts() # How many locations in ICEWS are NA? 3


######
# ICEWS, SOUTH KOREA
######
twitter_temp = twitter[twitter['Country'] == 'South Korea']
icews_temp = icews_agg[icews_agg['Country'] == 'South Korea']
icews_temp = icews_temp[icews_temp['Event_date'] >= '2016-10-20']
icews_temp = icews_temp[icews_temp['Event_date'] <= '2017-03-14']
twitter_icews = twitter_temp.merge(icews_temp, how='outer', on=['Country', 'State', 'City', 'Event_date'], indicator=True)

# Below is for all events
both = twitter_icews[twitter_icews['_merge'] == 'both']  # In both
left = twitter_icews[twitter_icews['_merge'] == 'left_only']  # Only Twitter
right = twitter_icews[twitter_icews['_merge'] == 'right_only']  # ICEWS only.  6 of 24 do not have city

# Percent in ICEWS also in Twitter
both.shape[0]/(both.shape[0]+right.shape[0])

right['City'].value_counts() # How many locations in ICEWS are NA? 6

######
# ICEWS, VENEZUELA
######
twitter_temp = twitter[twitter['Country'] == 'Venezuela']
icews_temp = icews_agg[icews_agg['Country'] == 'Venezuela']
icews_agg_temp = icews_temp[icews_temp['Event_date'] >= '2014-10-22']  # start of Twitter data for VE
icews_agg_temp = icews_agg_temp[icews_agg_temp['Event_date'] <= '2015-02-02']  # end of Twitter date for VE
icews_agg_temp2 = icews_temp[icews_temp['Event_date'] >= '2017-01-01']
icews_agg_temp2 = icews_agg_temp2[icews_agg_temp2['Event_date'] <= '2017-12-31']

icews_temp = icews_agg_temp.append(icews_agg_temp2)

twitter_icews = twitter_temp.merge(icews_temp, how='outer', on=['Country', 'State', 'City', 'Event_date'], indicator=True)

# Below is for all events
both = twitter_icews[twitter_icews['_merge'] == 'both']  # In both
left = twitter_icews[twitter_icews['_merge'] == 'left_only']  # Only Twitter
right = twitter_icews[twitter_icews['_merge'] == 'right_only']  # ICEWS only.

# Percent in ICEWS also in Twitter
both.shape[0]/(both.shape[0]+right.shape[0])

right['City'].value_counts() # How many locations in ICEWS are NA? 6


######
# ICEWS, CATALONIA
######
twitter_temp = twitter[twitter['Country'] == 'Spain']
icews_temp = icews_agg[icews_agg['Country'] == 'Spain']
icews_temp = icews_temp[icews_temp['State'] == 'Catalonia']

icews_temp = icews_temp[icews_temp['Event_date'] >= '2017-09-01']  # Do not need less than because I did not collect 2018 ICEWS data

twitter_icews = twitter_temp.merge(icews_temp, how='outer', on=['Country', 'State', 'City', 'Event_date'], indicator=True)

# Below is for all events
both = twitter_icews[twitter_icews['_merge'] == 'both']  # In both
left = twitter_icews[twitter_icews['_merge'] == 'left_only']  # Only Twitter
right = twitter_icews[twitter_icews['_merge'] == 'right_only']  # ICEWS only.  31 of 34 do not have city

# Percent in ICEWS also in Twitter
both.shape[0]/(both.shape[0]+right.shape[0])

